This flow is based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5747346/. RRBS data are assumed to have two columns per sample, one with the “-Me” suffix and the other with the “-Un” suffix. Additional comments about RRBS data:

  1. We set dummy variables for each vial label
  2. Condition/group dummy variables have 1 only for the “Me” data
  3. We estimate the common dispersion parameter once on all the data
  4. Library sizes are computed by looking and Me and Un for each sample (keep.lib.sizes=FALSE causes the library sizes to be recomputed after applying a filter)

This paper uses glmFit-based pipeline, whereas others recommend using glmQFit as it is more accurate for controlling type 1 error - see comments in https://support.bioconductor.org/p/76790/#:~:text=glmQLFit%20uses%20the%20trended%20NB,a%20GLM%20%2D%20and%20that’s%20it

Specify parameters

# edgeR's tolerance parameter: default is 1e-06, but this seems to be too slow
edger_tol = 1e-05

# Specify the parameters required for the analysis
# This dir should have the motrpac-bic-norm-qc repo
repo_local_dir = "~/Desktop/repos/"
# Runnable command for gsutil
gsutil_cmd = "~/google-cloud-sdk/bin/gsutil"
# Where should the data be downloaded to
local_data_dir = "~/Desktop/MoTrPAC/data/pass_1b/rrbs/"
# Bucket structure is: tissue/platform/results/<data files>. For GET, this path should
# also have a qa_qc directory with the qc_metrics and sample_metadata files.
# bucket = "gs://motrpac-release-data-staging/Results/pass1b-06/epigenomics/"
bucket = "gs://motrpac-external-release2-data-staging/Results/pass1b-06/epigenomics/"
# Specify bucket and local path for the phenotypic data
# this path is internal to BIC - faster
pheno_bucket = "gs://bic_data_analysis/pass1b/pheno_dmaqc/merged2021-03-22/"
pheno_local_dir = "~/Desktop/MoTrPAC/data/pass_1b/dmaqc_pheno/"

# we require a site to have a total count
# (both methylated and unmethylated) of at least min_count_coverage in 
# at least min_per_samples * (#samples); 
# specifying min_per_samples=1 means all samples
min_count_coverage = 8
min_per_samples = 1
# parameters for defining promoters: 
# define region sizes around TSS per gene
map_to_promoters=T
promoter_coord = c(-1000,2000)
# Specify how many PCs to examine
num_pcs = 5
num_pcs_for_outlier_analysis = 3
# Specify the significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr = 0.0001
OUTLIER_IQR_THR = 5

# output path
data_out_bucket = "gs://mawg-data/pass1b-06/epigen-rrbs/data/"
dea_out_bucket = "gs://mawg-data/pass1b-06/epigen-rrbs/dea/"
# Define technical and biological variables to be considered in the QC
pipeline_qc_cols = c("reads","pct_Unaligned","pct_GC","pct_chrM")
biospec_cols = c("registration.sex","key.anirandgroup",
                 "registration.batchnumber",
                 "training.staffid",
                  "is_control",
                  "vo2.max.test.vo2_max_visit1", # this assumes that visit1's are aligned
                  "terminal.weight.mg","time_to_freeze",
                  "timepoint","bid","pid")
# load functions and libraries
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/gcp_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/qc_report_aux_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/config_session.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/association_analysis_methods.R"))

Get the Rat annotations:

# manage the gene annotation
# # Install the rat db
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("org.Rn.eg.db")
library("org.Rn.eg.db")
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 

Load the RRBS data from the bucket and filter low counts

This flow is based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5747346/ Note that RRBS data are assumed to have two columns per sample, one with the “-Me” suffix and the other with the “-Un” suffix.

First, download the data to local dir (skip this if already downloaded, these are large data tables).

obj = download_bucket_to_local_dir(bucket,local_path=local_data_dir,
            remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
# Second, delete bismark files
bis_files = obj$downloaded_files[
 grepl("bismark.cov.gz",obj$downloaded_files)
]
rem_obj = sapply(bis_files,file.remove)

Now load and filter the RRBS data:

obj = list(
  downloaded_files = list.files(local_data_dir,full.names = T,recursive = T)
)
obj$downloaded_files = obj$downloaded_files[
  grepl("epigen-rrbs",obj$downloaded_files)
  ]
rdata_files = obj$downloaded_files[
  grepl(".RData$",ignore.case = T,obj$downloaded_files)
  ]
names(rdata_files) = sapply(rdata_files,GET_get_tissue_from_download_path)
tissues = names(rdata_files)
tissue2rrbs_data = list()
for (tissue in tissues){
  curr_rdata_file = rdata_files[grepl(tissue,rdata_files)]
  if(length(curr_rdata_file)!=1){
    print(paste("Warning: more than a single RData file in tissue:",tissue))
  }
  yall = get(load(curr_rdata_file))
  
  # Filtering unassembled chromosomes
  keep <- rep(TRUE, nrow(yall))
  Chr <- as.character(yall$genes$Chr)
  keep[ grep("random",Chr) ] <- FALSE
  keep[ grep("chrUn",Chr) ] <- FALSE
  # remove non-chr ones
  keep[!grepl("chr",Chr) ] <- FALSE
  # remove M chromosome (otherwise we get error when assigning annotation below)
  keep[Chr=="chrM"] <- FALSE
  print(paste("Data from tissue",tissue,"was loaded into session"))
  print(paste("how many sites were removed (unassembled+chrM)?",sum(!keep)))
  # keep.lib.sizes=FALSE causes the library sizes to be recomputed
  yall <- yall[keep,, keep.lib.sizes=FALSE]
  # fix the levels and order by chromosome
  # rat genome chromosomes:
  ChrNames <- paste0("chr",c(1:20,"X","Y"))
  #ChrNames <- paste0("chr",c(1:2))
  yall$genes$Chr <- factor(yall$genes$Chr, levels=ChrNames)
  o <- order(yall$genes$Chr, yall$genes$Locus)
  yall <- yall[o,]
  print(paste("Counts matrix dim before low counts filter:"))
  print(dim(yall))
  gc()
  
  # get locations, gene ids, etc.
  TSS <- nearestTSS(yall$genes$Chr, yall$genes$Locus, species="Rn")
  yall$genes$EntrezID <- TSS$gene_id
  yall$genes$Symbol <- TSS$symbol
  yall$genes$Strand <- TSS$strand
  yall$genes$Distance <- TSS$distance
  yall$genes$Width <- TSS$width
  #head(yall$genes)
  
  if(map_to_promoters){
  # Map data to promoters and sum over regions
      InPromoter <- yall$genes$Distance >= promoter_coord[1] & yall$genes$Distance <= promoter_coord[2]
  # We subset the CpGs to those contained in a promoter region:
      yIP <- yall[InPromoter,,keep.lib.sizes=FALSE]
  # Compute total counts
      ypr <- rowsum(yIP, yIP$genes$EntrezID, reorder=FALSE)
      yall = ypr
      gc()
  }
  
  # Remove low counts (by promoter)
  # The analysis needs to be restricted to CpG sites that have enough coverage for the
  # methylation level to be measurable in a meaningful way at that site. 
  # As a conservative rule of thumb, we require a site to have a total count
  # (both methylated and unmethylated) of at least 8 in every sample.
  Coverage <- yall$counts[,grepl("-Me",colnames(yall$counts))] + 
    yall$counts[, grepl("-Un",colnames(yall$counts))]
  gc()
  # see description of min_count_coverage and min_per_samples above
  keep <- rowSums(Coverage >= min_count_coverage) >= min_per_samples*ncol(Coverage)
  # filter the data
  y <- yall[keep,, keep.lib.sizes=FALSE]
  print(paste("Counts matrix dim after low counts filter:"))
  print(dim(y))
  
  # Data normalization
  # A key difference between BS-seq and other sequencing data is that the pair of libraries 
  # holding the methylated and unmethylated reads for a particular sample are treated as a unit.
  # To ensure that the methylated and unmethylated reads for the same sample are treated on the
  # same scale, we need to set the library sizes to be equal for each pair of libraries. 
  # We set the library sizes for each sample to be the average of the total read counts for 
  # the methylated and unmethylated libraries.
  # Other normalization methods developed for RNA-seq data, such as TMM, are not required for BS-seq data.
  TotalLibSize <- y$samples$lib.size[grepl("-Me",colnames(yall$counts))] +
    +       y$samples$lib.size[grepl("-Un",colnames(yall$counts))]
  y$samples$lib.size <- rep(TotalLibSize, each=2)
  tissue2rrbs_data[[tissue]] = y
  
  #  rm(yall);rm(y);gc()
}
## [1] "Data from tissue t52-hippocampus was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 72675"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6538607     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12244   104
## [1] "Data from tissue t55-gastrocnemius was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 74600"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6869469     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12387   104
## [1] "Data from tissue t58-heart was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 72607"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6597238     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12376   104
## [1] "Data from tissue t59-kidney was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 73240"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6638813     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12329   104
## [1] "Data from tissue t66-lung was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 70433"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6437815     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12302   104
## [1] "Data from tissue t68-liver was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 71495"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6499281     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12304   104
## [1] "Data from tissue t69-brown-adipose was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 78814"
## [1] "Counts matrix dim before low counts filter:"
## [1] 7385104     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12443   104
## [1] "Data from tissue t70-white-adipose was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 74742"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6962440     104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12357   104
# Get the pipeline qc scores
pipeline_scores_file = obj$downloaded_files[
    grepl("qa-qc-metrics",obj$downloaded_files) & 
      grepl(".csv$",obj$downloaded_files)
  ]
if(length(pipeline_scores_file)!=1){
  print("Error: RRBS data does not have a single pipeline scores file")
}
pipelineqc_scores = read.csv(pipeline_scores_file)
rownames(pipelineqc_scores) = as.character(pipelineqc_scores$vial_label)

Add the phenotypic data

#Add the phenotypic data
pheno_data = parse_pheno_data(pheno_bucket,local_path = pheno_local_dir,
                              remove_prev_files = F,GSUTIL_PATH=gsutil_cmd)
# add a tissue variable (for convinience)
pheno_data$viallabel_data$tissue = 
  pheno_data$viallabel_data$specimen.processing.sampletypedescription
# add the time to freeze variable ((for convinience))
pheno_data$viallabel_data$time_to_freeze = 
  pheno_data$viallabel_data$calculated.variables.frozetime_after_train - 
  pheno_data$viallabel_data$calculated.variables.deathtime_after_train
# add a binary is_control variable
pheno_data$viallabel_data$is_control = as.numeric(grepl("control",
  pheno_data$viallabel_data$key.anirandgroup,ignore.case = T))
# add the timepoint as a number
# x - the text description of the group
get_numeric_timepoint<-function(x){
  v = rep(0,length(x))
  tps = c("Eight"=8,"Four"=4,"One"=1,"Two"=2)
  for(tp in names(tps)){
    v[grepl(paste0(tp,"-week"),x,perl = T,ignore.case = T)] = tps[tp]
  }
  return(v)
}
pheno_data$viallabel_data$timepoint = get_numeric_timepoint(
  pheno_data$viallabel_data$key.anirandgroup
)

sample_covs = pheno_data$viallabel_data[,biospec_cols]

Dataset preprocessing

We transform the normalized data into M values and quantile normalize the resulting matrix. These can be interpreted as the base2 logit transformation of the proportion of methylated signal at each locus.

tissue2Mmatrix = list()
for(tissue in names(tissue2rrbs_data)){
  y = tissue2rrbs_data[[tissue]]
  Me <- y$counts[, grepl("-Me",colnames(y$counts))]
  Un <- y$counts[, grepl("-Un",colnames(y$counts))]
  # The M-value can be interpreted as the base2 logit transformation of the
  # proportion of methylated signal at each locus:
  M <- log2(Me + 2) - log2(Un + 2)
  colnames(M) <- gsub("-Me","",colnames(M))
  # rownames(M) == y$genes$EntrezID # sanity check
  
  # MDS plot
  # In this plot the distance between each pair of samples represents the average logit
  # change between the samples for the top most differentially methylated
  # CpG loci between that pair of samples.
  sex = sample_covs[colnames(M),1]
  sex[sex==1] = "M";sex[sex==2] = "F"
  cols = rep("blue",length(sex))
  cols[sex=="M"] = "green"
  plotMDS(M,labels = sex,pch = sample_covs[colnames(M),1]+20,col=cols,
          main = paste("MDS plot before quantile",tissue))
  
  samp = sample(1:ncol(M))[1:20]
  samp2 = sample(1:nrow(M))[1:min(20000,nrow(M))]
  boxplot(M[samp2,samp],
          main=paste0(tissue,"\nbefore quantile (",nrow(M)," promoters)"),
          ylab = "log2 M values",xaxt="n")
  M = run_quantile_normalization(M)
  boxplot(M[samp2,samp],
          main=paste0(tissue,"\nafter quantile (",nrow(M)," promoters)"),
          ylab = "log2 M values",xaxt="n")
  tissue2Mmatrix[[tissue]] = M
  plotMDS(M,labels = sex,pch = sample_covs[colnames(M),1]+20,col=cols,
          main = paste("MDS plot after quantile",tissue))
}

MOP-flagged samples

The RRBS pipeline manual of procedures (MOP) defines a flagged sample (e.g., potentially problematic) as a sample that satisfies one of the following: (1) number of read pairs <20M, (2) Low number of uniquely mapped reads: pct_Uniq <50%, (3) mapped read count (pct_Uniq*reads) < 50% of the median mapped read count of all samples within a tissue (within a sequencing batch), (4) bisulfite conversion efficiency less than 95%, i.e lambda_pct_CpG>5%, but limited to samples with lambda_pct_Uniq>0.5%, (5)unexpected strands mapped: pct_OT<30% or pct_OB>70%; pct_CTOT>10% or pct_CTOB>10%, and (6) high duplications based on UMI: pct_umi_dup >20%.

pipelineqc_scores$pipeline_flags =  rrbs_pipeline_flagged_samples(pipelineqc_scores)
mop_flagged_samples = rownames(pipelineqc_scores)[nchar(pipelineqc_scores$pipeline_flags)>0]

PCA visualization (sex and group)

We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.

tissue2pca = list()
for(tissue in names(tissue2Mmatrix)){
  curr_data = tissue2Mmatrix[[tissue]]
  # remove vial labels that start with 8 (reference samples)
  curr_data = curr_data[,grepl("^9",colnames(curr_data),perl = T)]
  # remove zero variance rows
  curr_data = curr_data[apply(curr_data,1,sd)>0,]
    
  curr_pca = prcomp(t(curr_data),scale. = T,center = T,rank. = num_pcs)
  curr_pcax = curr_pca$x[,1:num_pcs]
  explained_var = summary(curr_pca)[["importance"]][3,5]

  # plot
  df = data.frame(curr_pcax,
    randgroup = pheno_data$viallabel_data[rownames(curr_pcax),"key.anirandgroup"],
    sex = as.character(pheno_data$viallabel_data[rownames(curr_pcax),"registration.sex"]),
    stringsAsFactors = F
  )
  df$sex[df$sex=="1"] = "F"
  df$sex[df$sex=="2"] = "M"
  p = ggplot(df) + 
      geom_point(aes(x=PC1, y=PC2,col=randgroup, shape=sex)) +
      ggtitle(tissue)
  plot(p)
  tissue2pca[[tissue]] = df
}

PC-based association analysis

Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance.

For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.

pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(currname in names(tissue2Mmatrix)){
  curr_pcax = tissue2pca[[currname]]
  curr_pcax = curr_pcax[,grepl("^PC",colnames(curr_pcax))]
  # get the metadata of the samples
  curr_meta = cbind(pipelineqc_scores[rownames(curr_pcax),pipeline_qc_cols],
                     pheno_data$viallabel_data[rownames(curr_pcax),biospec_cols])
  # remove metadata variables with NAs
  curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
  # remove bid and pid
  curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
  # remove the text group description
  curr_meta = curr_meta[!grepl("randgr",names(curr_meta))]
  # remove fields withe zero variance
  curr_meta = curr_meta[,apply(curr_meta,2,sd)>0]
  
  # Assumption here: all metadata values are numeric
  corrs = cor(curr_pcax,curr_meta,method="spearman")
  corrsp = pairwise_eval(
    curr_pcax[rownames(curr_meta),],
    curr_meta,func=pairwise_association_wrapper,
    f=1)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(t(corrs),lab=T,lab_size=2.5,hc.order = F) +
    ggtitle(currname) +
    theme(plot.title = element_text(hjust = 0.5,size=20)))
  
  for(i in 1:nrow(corrsp)){
    for(j in 1:ncol(corrsp)){
      if(corrsp[i,j]>p_thr){next}
      pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
            c(currname,
              rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
            )
    }
  }
  colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
                                  "qc_metric","rho(spearman)","p-value")
  
  ####
  # compute correlations among the metadata variables
  ####
  corrs = cor(curr_meta,method="spearman")
  corrsp = pairwise_eval(
    curr_meta,func=pairwise_association_wrapper,
    f=1)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(corrs,lab=T,lab_size=1.5,hc.order = T))
  
  for(n1 in rownames(corrsp)){
    for(n2 in rownames(corrsp)){
      if(n1==n2){break}
      if(n1 %in% biospec_cols &&
         n2 %in% biospec_cols) {next}
      if(corrsp[n1,n2]>p_thr){next}
      metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
            c(currname,n1,n2,corrs[n1,n2],corrsp[n1,n2])
            )
    }
  }
  if(!is.null(dim(metadata_variable_assoc_report))){
    colnames(metadata_variable_assoc_report) = c(
      "Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
  }
}

# Format the reports - for a nicer presentation in a table
pcs_vs_qc_var_report[,5] = format(
  as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
  as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
metadata_variable_assoc_report[,5] = format(
  as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
  as.numeric(metadata_variable_assoc_report[,4]),digits=3)

PCA outliers

In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quantile range away from the boxplot. Samples outside this range are then flagged.

Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples.

pca_outliers_report = c()
for(currname in names(tissue2pca)){
  curr_pcax = tissue2pca[[currname]]
  curr_pcax = curr_pcax[,grepl("^PC",colnames(curr_pcax))]
  
  # Univariate: use IQRs
  pca_outliers = c()
  for(j in 1:ncol(curr_pcax)){
    outlier_values <- boxplot.stats(curr_pcax[,j],coef = OUTLIER_IQR_THR)$out
    for(outlier in names(outlier_values)){
      pca_outliers_report = rbind(pca_outliers_report,
       c(currname,paste("PC",j,sep=""),outlier,
         format(outlier_values[outlier],digits=5))
      )
      if(!is.element(outlier,names(pca_outliers))){
        pca_outliers[outlier] = outlier_values[outlier]
      }
    }
  }
  
  # Plot the outliers
  if(length(pca_outliers)>0){
    # print(length(kNN_outliers))
    df = data.frame(curr_pcax,
                outliers = rownames(curr_pcax) %in% names(pca_outliers))
    col = rep("black",nrow(df))
    col[df$outliers] = "green"
    plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(currname,"flagged outliers"),
         xlab = "PC1",ylab="PC2")
    plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(currname,"flagged outliers"),
         xlab = "PC3",ylab="PC4")
  }
}
if(!is.null(dim(pca_outliers_report))){
  colnames(pca_outliers_report) =  c("dataset","PC","sample","score")
}

Sex checks

We verify that the sex of a sample can be predicted from the percent of chromosome Y and percent of chromosome X reads. This is done automatically by training a logistic regression classifier, but we also plot the 2D plot for each tissue.

# sex checks - using simple logistic regression classifier (LOO-CV)
sex_read_info = cbind(pipelineqc_scores$pct_chrX,pipelineqc_scores$pct_chrY)
rownames(sex_read_info) = rownames(pipelineqc_scores)
sex_check_outliers = c()
for(currname in names(tissue2Mmatrix)){
  # 1 is female
  pca_df = tissue2pca[[currname]]
  read_info = sex_read_info[rownames(pca_df),]
  df = data.frame(sex=as.factor(pca_df$sex),
                  pct_chrX = read_info[,1],
                  pct_chrY = read_info[,2],stringsAsFactors = F)
  p = ggplot(df) + 
      geom_point(aes(x=pct_chrX, y=pct_chrY,col=sex, shape=sex)) +
      ggtitle(paste0(currname," - sex check"))
  plot(p)
  train_control <- trainControl(method = "cv", number = nrow(df),
                                savePredictions = TRUE)
  # train the model on training set
  model <- train(sex ~ .,data = df,
               trControl = train_control,
               method = "glm", family=binomial(link="logit"))
  # CV redults
  cv_res = model$pred
  cv_errors = cv_res[,1] != cv_res[,2]
  err_samples = rownames(df)[cv_res[cv_errors,"rowIndex"]]
  for(samp in err_samples){
    sex_check_outliers = rbind(sex_check_outliers,
                               c(currname,samp))
  }
}

if(! is.null(dim(sex_check_outliers))){
  colnames(sex_check_outliers) = c("dataset","sample")
}

Differential analysis

ftest_res = c() # keeps all ftest results
sex_ftest_res = c() # keeps the sex-specific f-tests
tpDA = c() # keep the timewise results
for(dataset_name in names(tissue2rrbs_data)){
  print(paste("Analyzing dataset:",dataset_name))
  y = tissue2rrbs_data[[dataset_name]]
  is_sample = grepl("^9",colnames(y),perl=T)
  y = y[,is_sample]
  curr_samps = sapply(colnames(y),function(x)strsplit(x,split="-")[[1]][1])
  
  # Parse the covariates
  covs = data.frame(
    sample = as.factor(curr_samps),
    Me = grepl("Me",colnames(y)),
    sex = pheno_data$viallabel_data[curr_samps,"registration.sex"],
    timepoint = pheno_data$viallabel_data[curr_samps,"timepoint"],
    is_control = pheno_data$viallabel_data[curr_samps,"is_control"]
  )
  covs$sex[covs$sex=="2"] = "M";covs$sex[covs$sex=="1"]="F"
  rownames(covs) = colnames(y)
  
  # Create design matrices for time-wise and the F tests 
  covs$group_sex_tp=factor(paste(covs$sex,covs$timepoint,covs$is_control,sep="_"))
  covs$group_tp=factor(paste(covs$timepoint,covs$is_control,sep="_"))
  samples_mat = model.matrix(~0+sample,data=covs)
  sex_tp_mat = model.matrix(~0+group_sex_tp,data=covs)
  tp_mat = model.matrix(~0+sex+group_tp+sex:group_tp,data=covs)
  sex_tp_mat[!covs$Me,] = 0
  tp_mat[!covs$Me,] = 0
  if(rankMatrix(cbind(sex_tp_mat,tp_mat))[1] != ncol(sex_tp_mat)){
    print(paste("Warning in",dataset_name,
                "check the design matrices, the two eq matrices have different ranks"))
  }
  
  des = cbind(samples_mat,sex_tp_mat)
  colnames(des) = gsub("group_sex_tp","",colnames(des))
  
  # A design matrix for the more general F tests
  full_model_str = "~0+sample+sex+group+sex:group"
  null_model_str = "~0+sample+sex"
  # (see https://support.bioconductor.org/p/12119/ and the limma guide)
  des2 = cbind(samples_mat,tp_mat)
  # define the contrasts for the analyses below
  C_ttests = makeContrasts(
    M_1_0 - M_8_1,M_2_0 - M_8_1,M_4_0 - M_8_1,M_8_0 - M_8_1,
    F_1_0 - F_8_1,F_2_0 - F_8_1,F_4_0 - F_8_1,F_8_0 - F_8_1,
    levels = des
  )
  print("Estimating dispersion for the first design matrix...")
  y1 <- estimateDisp(y, design=des,tol = edger_tol)
  print("Done")
  print("Running glmQLFit...")
  fit.ttest <- glmQLFit(y1,des)
  print("Done")

  # extract contrast info
  print("Extracting timewise results from the contrasts...")
  for(col in colnames(C_ttests)){
    curr_sex = strsplit(col,split="_")[[1]][1]
    sex_str = "male"
    if(curr_sex=="F"){sex_str="female"}
    curr_tp = strsplit(col,split="_")[[1]][2]

    print(paste("Fitting timewise model for:",col))
    res <- glmQLFTest(fit.ttest,contrast=C_ttests[,col])
    plotMD(res)
    # extract the results into a table
    edger_res <- topTags(res, n=Inf, p.value = 1,
                        adjust.method = "BH",sort.by = "none")$table
    # add z-scores
    edger_res$F[edger_res$F < 0] = 1e-10
    t.stat <- sign(edger_res$logFC) * sqrt(edger_res$F)
    z <- zscoreT(t.stat, df=res$df.total)
    curr_res = data.frame(
      feature_ID = rownames(y),
      edger_res[,1:4],
      assay = "epigen-rrbs",
      tissue = dataset_name,
      sex = sex_str,
      #logFC_se = TBD,
      logFC = edger_res$logFC,
      fscore = edger_res$F,
      zscore = z,
      covariates = NA,
      comparison_group = paste0(curr_tp,"w"),
      p_value = edger_res$PValue,
      adj_p_value = edger_res$FDR
    )
    # add the results
    tpDA = rbind(tpDA,curr_res)
  }
  print("Done")
  
  print("Estimating dispersion for the second design matrix...")
  y2 <- estimateDisp(y, design=des2,tol = edger_tol)
  print("Done")
  
  if(any(abs(y1$trended.dispersion-y2$trended.dispersion)>1e-05) ||
     any(abs(y1$tagwise.dispersion-y2$tagwise.dispersion)>1e-05)){
    print(paste("Warning in",dataset_name,
                "dispersion estimates are inconsistent, please check"))
  }
  
  print("Fitting the model for the F-tests...")
  fit.ftest <- glmQLFit(y2,des2)
  is_group_variable = grepl("group",colnames(des2))
  res <- glmQLFTest(fit.ftest,coef=colnames(des2)[is_group_variable])
  ftest_edger_res <- topTags(res, n=Inf, p.value = 1,
                       adjust.method = "BH",sort.by = "none")$table
  print("Done")
  
  # add the results
  curr_f_test_res = data.frame(
    feature_ID = rownames(y),
    ftest_edger_res[,1:4],
    assay = "epigen-rrbs",
    tissue = dataset_name,
    p_value = ftest_edger_res$PValue,
    adj_p_value = ftest_edger_res$FDR,
    fscore = ftest_edger_res$F,
    full_model = full_model_str,
    reduced_model = null_model_str
  )
  ftest_res = rbind(ftest_res,curr_f_test_res)
  #print(table(ftest_res$adj_p_value < 0.1))
  
  print("Fitting model for the sex-assoc tests...")
  is_interaction_variable = grepl("group",colnames(des2)) & grepl("sex",colnames(des2))
  res <- glmQLFTest(fit.ftest,coef=colnames(des2)[is_interaction_variable])
  sex_edger_res <- topTags(res, n=Inf, p.value = 1,
                       adjust.method = "BH",sort.by = "none")$table
  print("Done")
  
  # add the results
  curr_sex_f_test_res = data.frame(
    feature_ID = rownames(y),
    sex_edger_res[,1:4],
    assay = "epigen-rrbs",
    tissue = dataset_name,
    p_value = sex_edger_res$PValue,
    adj_p_value = sex_edger_res$FDR,
    fscore = sex_edger_res$F,
    full_model = full_model_str,
    reduced_model = "~1+sample+sex+group"
  )
  sex_ftest_res = rbind(sex_ftest_res,curr_sex_f_test_res)
  
  # free space, erase objects
  rm(edger_res);rm(sex_edger_res);rm(ftest_edger_res)
  rm(des);rm(des2)
  rm(y);rm(y1);rm(y2)
  gc()
}
## [1] "Analyzing dataset: t52-hippocampus"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"

## [1] "Fitting timewise model for: M_2_0 - M_8_1"

## [1] "Fitting timewise model for: M_4_0 - M_8_1"

## [1] "Fitting timewise model for: M_8_0 - M_8_1"

## [1] "Fitting timewise model for: F_1_0 - F_8_1"

## [1] "Fitting timewise model for: F_2_0 - F_8_1"

## [1] "Fitting timewise model for: F_4_0 - F_8_1"

## [1] "Fitting timewise model for: F_8_0 - F_8_1"

## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t55-gastrocnemius"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"

## [1] "Fitting timewise model for: M_2_0 - M_8_1"

## [1] "Fitting timewise model for: M_4_0 - M_8_1"

## [1] "Fitting timewise model for: M_8_0 - M_8_1"

## [1] "Fitting timewise model for: F_1_0 - F_8_1"

## [1] "Fitting timewise model for: F_2_0 - F_8_1"

## [1] "Fitting timewise model for: F_4_0 - F_8_1"

## [1] "Fitting timewise model for: F_8_0 - F_8_1"

## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t58-heart"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"

## [1] "Fitting timewise model for: M_2_0 - M_8_1"

## [1] "Fitting timewise model for: M_4_0 - M_8_1"

## [1] "Fitting timewise model for: M_8_0 - M_8_1"

## [1] "Fitting timewise model for: F_1_0 - F_8_1"

## [1] "Fitting timewise model for: F_2_0 - F_8_1"

## [1] "Fitting timewise model for: F_4_0 - F_8_1"

## [1] "Fitting timewise model for: F_8_0 - F_8_1"

## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t59-kidney"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"

## [1] "Fitting timewise model for: M_2_0 - M_8_1"

## [1] "Fitting timewise model for: M_4_0 - M_8_1"

## [1] "Fitting timewise model for: M_8_0 - M_8_1"

## [1] "Fitting timewise model for: F_1_0 - F_8_1"

## [1] "Fitting timewise model for: F_2_0 - F_8_1"

## [1] "Fitting timewise model for: F_4_0 - F_8_1"

## [1] "Fitting timewise model for: F_8_0 - F_8_1"

## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t66-lung"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"

## [1] "Fitting timewise model for: M_2_0 - M_8_1"

## [1] "Fitting timewise model for: M_4_0 - M_8_1"

## [1] "Fitting timewise model for: M_8_0 - M_8_1"

## [1] "Fitting timewise model for: F_1_0 - F_8_1"

## [1] "Fitting timewise model for: F_2_0 - F_8_1"

## [1] "Fitting timewise model for: F_4_0 - F_8_1"

## [1] "Fitting timewise model for: F_8_0 - F_8_1"

## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t68-liver"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"

## [1] "Fitting timewise model for: M_2_0 - M_8_1"

## [1] "Fitting timewise model for: M_4_0 - M_8_1"

## [1] "Fitting timewise model for: M_8_0 - M_8_1"

## [1] "Fitting timewise model for: F_1_0 - F_8_1"

## [1] "Fitting timewise model for: F_2_0 - F_8_1"

## [1] "Fitting timewise model for: F_4_0 - F_8_1"

## [1] "Fitting timewise model for: F_8_0 - F_8_1"

## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t69-brown-adipose"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"

## [1] "Fitting timewise model for: M_2_0 - M_8_1"

## [1] "Fitting timewise model for: M_4_0 - M_8_1"

## [1] "Fitting timewise model for: M_8_0 - M_8_1"

## [1] "Fitting timewise model for: F_1_0 - F_8_1"

## [1] "Fitting timewise model for: F_2_0 - F_8_1"

## [1] "Fitting timewise model for: F_4_0 - F_8_1"

## [1] "Fitting timewise model for: F_8_0 - F_8_1"

## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t70-white-adipose"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"

## [1] "Fitting timewise model for: M_2_0 - M_8_1"

## [1] "Fitting timewise model for: M_4_0 - M_8_1"

## [1] "Fitting timewise model for: M_8_0 - M_8_1"

## [1] "Fitting timewise model for: F_1_0 - F_8_1"

## [1] "Fitting timewise model for: F_2_0 - F_8_1"

## [1] "Fitting timewise model for: F_4_0 - F_8_1"

## [1] "Fitting timewise model for: F_8_0 - F_8_1"

## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"

Print the results to files:

out_bucket = dea_out_bucket
suffix = "20210503.txt"
# timewise tables
for(tissue in unique(tpDA$tissue)){
  currDA = tpDA[tpDA$tissue == tissue,]
  local_fname = paste0(local_data_dir,
                       "pass1b-06_",tissue,"_epigen-rrbs_timewise-dea_",suffix)
  write.table(currDA,file=local_fname,sep="\t",quote=F,
              row.names = F,col.names = T)
  cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
  system(cmd)
  system(paste("rm",local_fname))
}
# f-test tables
for(tissue in unique(ftest_res$tissue)){
  currTable = ftest_res[ftest_res$tissue == tissue,]
  local_fname = paste0(local_data_dir,
                       "pass1b-06_",tissue,"_epigen-rrbs_training-dea_",suffix)
  write.table(currTable,file=local_fname,sep="\t",quote=F,
              row.names = F,col.names = T)
  cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
  system(cmd)
  system(paste("rm",local_fname))
}
# sex-based results
for(tissue in unique(sex_ftest_res$tissue)){
  currTable = sex_ftest_res[sex_ftest_res$tissue == tissue,]
  local_fname = paste0(local_data_dir,
                       "pass1b-06_",tissue,"_epigen-rrbs_training-sex-biased_",suffix)
  write.table(currTable,file=local_fname,sep="\t",quote=F,
              row.names = F,col.names = T)
  cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
  system(cmd)
  system(paste("rm",local_fname))
}